Aluno: Felipe Martim Fernandes Vieira
Este portfólio demonstra o que aprendi e sou capaz de fazer utilizando a linguagem R como ferramenta estatística e de modelagem de inteligência artificial.
install.packages("dplyr")
library(dplyr)
install.packages("caret")
library(caret)
# Numeric (double) vector holding 1 through 9.
vetor <- as.numeric(seq_len(9))
# 3-row matrix filled column by column from the vector.
matriz <- matrix(data = vetor, nrow = 3, byrow = FALSE)
vetor
[1] 1 2 3 4 5 6 7 8 9
vetor[1:3]
[1] 1 2 3
vetor[vetor < 6]
[1] 1 2 3 4 5
matriz
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
matriz[1, 2]
[1] 4
matriz[, 3]
[1] 7 8 9
library(lubridate)
data <- ymd("20190731")
um.ano.depois <- data + years(1)
um.ano.depois
[1] "2020-07-31"
normal <- rnorm(100, mean = 5, sd = 2)
normal
[1] 3.1277659 0.1684599 2.7418471 -0.2346442 6.2085593 5.5716705 4.0744676 6.8028021 7.2166787
[10] 4.0338440 4.6780264 4.4888754 5.4182834 4.2039965 4.9063195 7.3497871 3.0388679 4.1575199
[19] 3.3326817 4.5261037 8.0192970 4.2822641 2.1249156 5.3805223 5.1970747 5.5682728 5.8932552
[28] 4.7527468 8.6907711 5.9201752 5.1779752 5.2843140 5.0133811 3.5442119 3.7282831 5.2485034
[37] 4.8679826 5.3088418 6.6768776 1.8172700 5.8288302 8.0845784 5.4326765 5.2576235 4.3615531
[46] 6.0309208 4.2384613 7.0101065 5.2480359 7.6080098 5.0803198 4.9606184 4.1293023 6.9487503
[55] 2.5170328 3.5122507 4.7964694 4.0627517 4.3491371 3.0958915 2.2625576 8.0066741 5.6618914
[64] 4.8169608 4.2531599 3.0105086 5.1646673 4.5956364 7.1907168 2.8023568 5.4818073 7.8501698
[73] 3.9416546 8.1842647 3.7772834 3.7946262 5.5471808 5.0398361 4.9628027 5.5912824 3.2937300
[82] 6.7017975 7.4043407 7.5052404 0.7527118 6.0102035 3.3182674 9.2304810 6.1407086 4.4644801
[91] 3.0580821 8.4015841 6.2807730 3.0274393 4.9521353 4.4259306 4.4344114 4.2093162 2.0083353
[100] 7.5737060
binomial <- rbinom(100, 10, prob = 0.7)
binomial
[1] 9 7 8 7 5 8 9 7 9 8 9 6 7 9 7 7 8 5 7 7 8 7 5 9 7 5 6 6 7 8 6 6 9 8 7
[36] 5 8 5 9 5 6 5 9 5 7 7 8 9 4 8 9 5 8 8 6 8 9 5 10 8 8 6 7 4 9 7 8 6 6 7
[71] 8 7 9 3 9 9 6 7 7 9 4 5 5 7 8 8 6 6 6 4 8 9 8 10 8 7 8 10 5 6
hist(normal)
hist(binomial)
boxplot(normal)
boxplot(binomial)
data(mtcars)
summary(mtcars)
mpg cyl disp hp drat wt
Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0 Min. :2.760 Min. :1.513
1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5 1st Qu.:3.080 1st Qu.:2.581
Median :19.20 Median :6.000 Median :196.3 Median :123.0 Median :3.695 Median :3.325
Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7 Mean :3.597 Mean :3.217
3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0 3rd Qu.:3.920 3rd Qu.:3.610
Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0 Max. :4.930 Max. :5.424
qsec vs am gear carb
Min. :14.50 Min. :0.0000 Min. :0.0000 Min. :3.000 Min. :1.000
1st Qu.:16.89 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
Median :17.71 Median :0.0000 Median :0.0000 Median :4.000 Median :2.000
Mean :17.85 Mean :0.4375 Mean :0.4062 Mean :3.688 Mean :2.812
3rd Qu.:18.90 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
Max. :22.90 Max. :1.0000 Max. :1.0000 Max. :5.000 Max. :8.000
head(mtcars)
# Seed the RNG first so the six sampled rows are the same on every run.
set.seed(132)
# Candidate row positions: 1 .. number of rows in mtcars.
ids <- seq_len(nrow(mtcars))
# Draw six distinct row positions and show the corresponding rows.
amostra <- sample(x = ids, size = 6, replace = FALSE)
mtcars[amostra, , drop = FALSE]
library(plotly)
#' Scatter plot of two columns with a fitted linear trend line.
#'
#' Fits a simple linear regression of column `y` on column `x` and draws the
#' observed points plus the fitted line with plotly. The plot title is
#' "<y> vs <x>".
#'
#' @param data A data frame (or tibble) containing both columns.
#' @param x Name (string) of the column shown on the horizontal axis.
#' @param y Name (string) of the column shown on the vertical axis.
#' @return The value of `print(p)`, where `p` is the plotly figure; the
#'   function is called for the side effect of displaying the plot.
plotWithTrendLine <- function(data, x, y) {
  missing_cols <- setdiff(c(x, y), names(data))
  if (length(missing_cols) > 0) {
    stop(
      "Column(s) not found in `data`: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # `[[` yields a plain vector for data frames and tibbles alike, whereas
  # `data[, x]` returns a one-column tibble when `data` is a tibble.
  x_values <- data[[x]]
  y_values <- data[[y]]

  # na.exclude makes predict() pad incomplete rows with NA, so the fitted
  # values keep the same length and order as x_values.
  fit <- lm(y_values ~ x_values, na.action = na.exclude)
  trend <- predict(fit)

  p <- plot_ly(
    data = data, x = x_values, y = y_values,
    type = "scatter", mode = "markers",
    colors = c("#FF9999", "lightblue"), opacity = 0.8
  )
  p <- add_trace(
    p,
    x = x_values, y = trend,
    mode = "lines", line = list(color = "lightblue"),
    showlegend = FALSE, hoverinfo = "none", inherit = FALSE
  )
  p <- layout(
    p,
    title = paste(y, "vs", x, sep = " "),
    xaxis = list(title = x, showline = TRUE),
    yaxis = list(side = "left", title = y)
  )
  print(p)
}
plotWithTrendLine(mtcars, 'mpg', 'disp')
NULL
# Download the sample image into the current directory, keeping the
# remote file name.
file.url <- 'https://poenaroda.com.br/wp-content/uploads/2019/02/destaque_alanturing.jpg'
# "." (not "./") avoids a doubled separator in the resulting path.
file.local <- file.path(".", basename(file.url))
# Binary mode so the JPEG bytes are written unmodified.
download.file(url = file.url, destfile = file.local, mode = "wb")
trying URL 'https://poenaroda.com.br/wp-content/uploads/2019/02/destaque_alanturing.jpg'
Content type 'image/jpeg' length 51027 bytes (49 KB)
==================================================
downloaded 49 KB
setwd('~/workspace/fiap/wine')
The working directory was changed to /Users/felipe/workspace/fiap/wine inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
# Load the wine dataset: semicolon-separated file with a header row.
wines <- read.csv2("BaseWine_Red_e_White.csv", header = TRUE, sep = ";")
# Build the table this step depends on: drop `id_vinho` and rename `Vinho`
# to `type`. The file's other assignment of wines_adjusted comes after this
# point, so a top-to-bottom run would otherwise stop here with
# "object 'wines_adjusted' not found".
wines_adjusted <- wines %>%
  select(-id_vinho) %>%
  rename(type = Vinho)

# Center and scale the first 11 columns. predict() returns the full table
# with its original column names, so no re-wrapping or renaming is needed.
wines_padr <- predict(
  preProcess(wines_adjusted[, 1:11], method = c("center", "scale")),
  wines_adjusted
)
str(wines_padr)
'data.frame': 6497 obs. of 13 variables:
$ fixedacidity : num -0.475 -0.397 2.611 -1.4 -0.397 ...
$ volatileacidity : num -0.60537 0.00203 -0.18019 -0.96981 -0.24093 ...
$ citricacid : num 0.216 0.766 1.179 -0.541 0.835 ...
$ residualsugar : num 0.478 -0.813 -0.686 -0.135 2.817 ...
$ chlorides : num -0.7146 -0.4291 0.1988 -0.4291 0.0276 ...
$ freesulfurdioxide : num 0.3084 -0.0859 -0.7057 -0.0296 1.9423 ...
$ totalsulfurdioxide: num 0.3407 -0.0309 -1.3401 -0.0486 1.9153 ...
$ density : num -0.3019 -1.5394 0.983 -0.0821 1.6457 ...
$ pH : num -0.1773 0.0715 -0.4882 1.2532 -0.6748 ...
$ sulphates : num -1.08375 -0.61334 -0.14293 -0.88214 -0.00852 ...
$ alcohol : num 0.0114 1.739 -0.5645 -0.8936 -1.1404 ...
$ quality : int 5 6 6 6 5 5 4 6 7 6 ...
$ type : Factor w/ 2 levels "RED","WHITE": 2 2 1 2 2 1 1 1 2 2 ...
summary(wines_padr)
fixedacidity volatileacidity citricacid residualsugar chlorides
Min. :-2.6344 Min. :-1.5772 Min. :-2.19266 Min. :-1.0243 Min. :-1.3425
1st Qu.:-0.6289 1st Qu.:-0.6661 1st Qu.:-0.47230 1st Qu.:-0.7704 1st Qu.:-0.5148
Median :-0.1661 Median :-0.3017 Median :-0.05941 Median :-0.5164 Median :-0.2579
Mean : 0.0000 Mean : 0.0000 Mean : 0.00000 Mean : 0.0000 Mean : 0.0000
3rd Qu.: 0.3739 3rd Qu.: 0.3665 3rd Qu.: 0.49111 3rd Qu.: 0.5629 3rd Qu.: 0.2559
Max. : 6.6989 Max. : 7.5338 Max. : 9.23057 Max. : 8.5416 Max. :15.8410
freesulfurdioxide totalsulfurdioxide density pH sulphates
Min. :-1.66345 Min. :-1.9416 Min. :-2.56383 Min. :-3.10038 Min. :-2.0918
1st Qu.:-0.76202 1st Qu.:-0.6855 1st Qu.:-0.79551 1st Qu.:-0.67481 1st Qu.:-0.6805
Median :-0.08594 Median : 0.0399 Median : 0.06668 Median :-0.05287 Median :-0.1429
Mean : 0.00000 Mean : 0.0000 Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
3rd Qu.: 0.59014 3rd Qu.: 0.7122 3rd Qu.: 0.77672 3rd Qu.: 0.63126 3rd Qu.: 0.4619
Max. :14.56245 Max. : 5.7368 Max. : 6.52124 Max. : 4.92265 Max. : 9.8701
alcohol quality type
Min. :-7.8396 Min. :3.000 RED :1599
1st Qu.:-0.8113 1st Qu.:5.000 WHITE:4898
Median :-0.1532 Median :6.000
Mean : 0.0000 Mean :5.818
3rd Qu.: 0.6695 3rd Qu.:6.000
Max. : 3.6311 Max. :9.000
# Drop the `id_vinho` column and expose `Vinho` under the name `type`.
wines_adjusted <- rename(select(wines, -id_vinho), type = Vinho)
# Peek at the first rows of the cleaned table.
head(wines_adjusted)
# Fixed acidity against density, coloured by wine type, with one linear
# fit per type drawn over semi-transparent points.
ggplot(wines_adjusted, aes(x = density, y = fixedacidity, color = type)) +
  geom_point(alpha = 0.2, size = 2) +
  geom_smooth(method = "lm")
plot(airquality$Wind, airquality$Ozone)
# Fix the RNG so the randomly chosen initial centers are reproducible.
set.seed(1232)
# Cluster on the 11 standardized numeric columns only. wines_padr also
# carries the unscaled `quality` and the `type` factor, which must stay out
# of the distance computation. `iter.max` is spelled out in full instead of
# relying on partial matching of `iter`.
output_cluster <- kmeans(wines_padr[, 1:11], centers = 6, iter.max = 100)
output_cluster
K-means clustering with 6 clusters of sizes 946, 1206, 1483, 1219, 646, 997
Cluster means:
fixedacidity volatileacidity citricacid residualsugar chlorides freesulfurdioxide totalsulfurdioxide
1 0.08473360 1.6866294 -1.26213820 -0.6317944 0.6817667 -0.7986723 -1.16915557
2 -0.52520677 -0.2668938 -0.02511597 -0.4822657 -0.5802020 -0.1213941 -0.16797325
3 -0.17028943 -0.3492428 0.31064945 1.4659004 -0.1443988 0.9372052 0.99798842
4 0.09639664 -0.4497086 0.23995436 -0.2777307 -0.2107031 -0.2721956 0.07332153
5 2.00659014 0.5019104 0.95725823 -0.5616504 1.2624821 -0.8942313 -1.25067366
6 -0.60981357 -0.5333900 -0.14775699 -0.2941430 -0.2906727 0.4228183 0.54878266
density pH sulphates alcohol
1 0.5037673 0.96129972 0.40065402 -0.24807743
2 -1.3589390 -0.04482517 -0.30296210 1.39885650
3 0.9223711 -0.49867076 -0.27728431 -0.84938155
4 -0.4284112 -0.72419807 -0.50528600 -0.08757578
5 1.0021260 -0.06731150 1.42050025 0.02119780
6 -0.3316950 0.81291779 0.09615516 -0.09994560
Clustering vector:
[1] 4 2 5 6 3 1 1 1 2 3 3 6 6 6 4 2 1 4 1 6 6 3 1 4 4 4 6 5 1 3 2 1 5 4 1 6 6 3 2 3 3 3 6 5 2 3 4 3 2 2 4
[52] 2 6 6 1 6 4 6 1 1 2 3 4 4 1 2 4 5 2 3 6 4 3 1 4 5 5 6 2 3 2 2 5 5 5 1 2 3 1 2 5 6 2 2 3 2 3 4 2 2 1 4
[103] 2 4 6 1 4 2 4 6 3 4 4 4 4 3 2 4 6 2 4 6 5 3 6 3 4 6 1 1 6 5 6 2 3 1 1 3 4 3 3 3 2 3 2 2 5 3 1 1 6 4 4
[154] 2 1 6 3 3 5 3 3 6 3 5 2 2 1 4 6 4 1 1 3 3 2 6 2 1 6 3 2 1 4 1 4 6 3 6 1 3 4 1 3 2 5 2 6 3 2 3 4 2 3 6
[205] 3 6 3 6 5 3 1 2 6 5 5 6 3 2 4 3 3 3 4 4 3 1 1 4 2 3 1 2 4 3 1 4 3 1 5 3 5 1 1 3 2 6 6 6 3 2 4 6 2 2 4
[256] 3 5 3 3 3 3 1 1 2 1 2 2 6 4 3 2 4 3 3 1 5 2 3 5 1 2 3 5 1 6 5 2 5 3 2 3 6 2 2 1 6 1 3 5 6 3 3 1 5 4 3
[307] 2 1 1 3 4 1 5 1 2 3 1 4 1 1 4 6 6 3 6 2 4 1 1 3 5 4 3 3 5 1 4 3 5 3 2 5 4 3 6 6 1 5 4 2 2 4 4 6 4 4 3
[358] 2 6 6 4 2 3 2 6 1 2 3 2 4 3 4 3 1 4 1 2 6 4 6 5 5 3 6 3 1 2 4 2 2 1 2 6 5 3 6 5 1 4 4 1 4 3 4 2 3 6 2
[409] 5 1 4 4 2 6 3 6 4 3 5 1 2 3 3 3 3 3 2 6 2 2 6 1 2 2 4 1 3 4 2 2 3 4 3 3 3 3 1 6 1 1 6 4 4 4 6 3 3 2 4
[460] 4 2 3 1 6 6 4 2 6 4 2 2 2 2 4 3 4 6 3 6 2 4 5 2 2 6 6 3 2 4 3 4 4 2 4 2 1 3 4 5 3 2 4 6 2 4 6 5 6 4 6
[511] 5 4 1 6 3 3 5 6 2 3 1 1 3 6 6 4 6 3 6 4 6 2 2 1 6 2 1 3 1 6 3 4 1 4 2 5 6 6 1 4 3 3 3 6 2 6 5 2 5 6 4
[562] 2 3 3 6 2 3 6 2 5 4 4 6 2 4 4 3 3 3 2 6 5 2 4 3 3 5 2 1 3 4 1 1 3 1 2 3 3 2 3 4 3 3 3 5 1 1 1 1 4 3 4
[613] 4 4 5 3 2 1 6 5 2 2 4 4 2 1 1 1 2 2 3 5 1 6 3 3 4 4 3 5 2 3 4 3 2 2 6 6 3 3 3 2 4 5 2 2 1 3 2 5 5 2 2
[664] 5 5 3 6 4 2 1 6 3 4 2 4 2 3 1 4 3 1 4 5 3 3 2 1 4 3 4 1 1 2 2 1 2 3 3 3 3 2 5 4 2 1 2 4 3 2 1 6 1 2 4
[715] 3 4 3 4 2 6 3 3 6 3 3 6 6 4 6 4 4 1 2 6 3 3 6 2 5 5 3 4 2 4 1 2 2 1 2 5 5 1 6 5 6 6 2 5 1 1 6 1 4 2 1
[766] 2 5 3 6 1 3 1 3 6 2 5 2 4 4 2 3 4 4 2 1 6 2 6 6 1 1 5 6 4 5 4 3 1 4 2 2 4 6 4 1 6 3 1 1 4 4 3 3 5 3 6
[817] 5 4 6 6 1 1 5 1 6 4 3 1 2 4 4 6 3 3 4 4 4 3 4 4 4 1 3 6 3 3 3 6 2 5 5 6 4 3 4 2 4 2 3 1 2 2 2 5 6 4 1
[868] 6 6 3 4 3 4 6 5 4 6 2 6 5 1 1 2 1 2 4 5 6 3 4 2 1 2 2 5 6 6 1 2 4 1 2 3 2 3 3 3 1 3 4 1 2 4 3 6 5 3 1
[919] 5 2 2 4 1 4 1 4 1 2 4 6 6 1 3 2 2 6 5 3 6 4 5 1 4 5 1 6 1 1 6 6 2 2 2 6 4 2 1 2 3 2 2 2 3 3 5 6 3 6 5
[970] 5 3 5 2 1 1 2 6 4 2 2 2 4 5 3 3 6 6 2 6 1 6 1 4 3 2 6 5 6 3 4
[ reached getOption("max.print") -- omitted 5497 entries ]
Within cluster sum of squares by cluster:
[1] 5121.289 4905.915 7942.310 5922.045 7546.492 4797.123
(between_SS / total_SS = 49.3 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
[8] "iter" "ifault"
table(output_cluster$cluster)
1 2 3 4 5 6
946 1206 1483 1219 646 997